library(ggplot2)
library(plyr)
library(scales)

ooni.index <- read.csv("ooni-index.csv.xz",
	colClasses=c(date="POSIXct", cc="character", asn="integer", test_name="character"),
	na.strings="--" # "NA" is Namibia.
)
country.codes <- read.csv("country-codes.csv",
	stringsAsFactors=F,
	na.strings="--" # "NA" is North America.
)
hdi <- read.csv("hdi.csv",
	stringsAsFactors=F
)

# We need to merge the country.codes and hdi data sets. country.codes has nice
# ISO-3166 2-letter codes alongside spelled-out country names, but hdi only has
# the spelled-out names, and the spelled-out names are not always exactly the
# same across the two data sets. We need to build a mapping from hdi's
# spelled-out country names to ISO-3166 2-letter code. Most of the cases are
# handled by matching hdi$country with country.codes$official_name_en; then we
# manually fill out the countries that are still missing.
hdi.country.to.code <- country.info <- merge(hdi, country.codes, by.x=c("country"), by.y=c("official_name_en"))[, c("country", "ISO3166.1.Alpha.2")]
hdi.country.to.code <- rbind(hdi.country.to.code,
	c("Congo (Democratic Republic of the)", "CD"),
	c("Czech Republic", "CZ"),
	c("Hong Kong, China (SAR)", "HK"),
	c("Korea (Republic of)", "KR"),
	c("Moldova (Republic of)", "MD"),
	c("Palestine, State of", "PS"),
	c("Tanzania (United Republic of)", "TZ"),
	c("United Kingdom", "GB"),
	c("United States", "US")
)
hdi <- merge(hdi, hdi.country.to.code, by=c("country"), all.x=T)

country.info <- merge(country.codes, hdi, by=c("ISO3166.1.Alpha.2"), all=T)
data <- merge(ooni.index, country.info, by.x=c("cc"), by.y=c("ISO3166.1.Alpha.2"), all.x=T)
data$Continent <- factor(data$Continent,
	levels=c("AF", "NA", "SA", "AN", "AS", "EU", "OC", NA),
	labels=c("Africa", "N.America", "S.America", "Antarctica", "Asia", "Europe", "Oceania", "??"),
	exclude=NULL
)

# Check if anything is missing an HDI because of the uncertain mapping of spelled-out country names.
options(width=120)
print("Countries missing an HDI:")
print(count(data[is.na(data$hdi), c("cc", "name", "official_name_en")]))

plot_num_reports_by_date <- function(data, title) {
	p <- ggplot(data)
	p <- p + geom_bar(aes(as.Date(date)))
	p <- p + scale_x_date(date_breaks="1 year", date_minor_breaks="1 month")
	p <- p + labs(title="Number of OONI reports per day", x="date", y="number of reports")
	p <- p + theme_bw()
	p
}

plot_num_reports_by_date_and_continent <- function(data, title) {
	p <- ggplot(data)
	p <- p + geom_bar(aes(as.Date(date)))
	p <- p + scale_x_date(date_breaks="1 year", date_minor_breaks="1 month")
	p <- p + facet_grid(Continent ~ .)
	p <- p + labs(title=title, x="date", y="number of reports")
	p <- p + theme_bw()
	p
}

plot_num_reports_by_test_name <- function(data, title) {
	p <- ggplot(data)
	p <- p + geom_bar(aes(reorder(test_name, test_name, function(x) length(x))))
	p <- p + coord_flip()
	p <- p + labs(title=title, x=NULL, y="number of reports")
	p <- p + theme_bw()
	p
}

plot_num_reports_by_country <- function(data, title) {
	p <- ggplot(data)
	p <- p + geom_text(aes(x=Continent, group=cc, label=cc), stat="count", alpha=0.8, size=2.5, position=position_jitter(width=0.25))
	p <- p + scale_y_continuous(trans="log10", breaks=trans_breaks("log10", function(x) 10^x), labels=comma)
	p <- p + labs(title=title, x="continent", y="number of reports")
	p <- p + theme_bw()
	p
}

plot_num_reports_by_hdi <- function(data, title) {
	p <- ggplot(data)
	p <- p + geom_text(aes(x=hdi, group=cc, label=cc, color=Continent), stat="count", size=2.5)
	p <- p + scale_y_continuous(trans="log10", breaks=trans_breaks("log10", function(x) 10^x), labels=comma)
	p <- p + scale_color_brewer(palette="Set2")
	p <- p + labs(title=title, x="Human Development Index 2015", y="number of reports")
	p <- p + guides(color=guide_legend(override.aes=list(alpha=1, size=4)))
	p <- p + theme_bw()
	p
}

plot_num_reports_by_date(data[data$date >= as.POSIXct("2015-08-28"), ], "Number of OONI reports per day")
plot_num_reports_by_date_and_continent(data[data$date >= as.POSIXct("2015-08-28"), ], "Number of OONI reports per day, by continent") + coord_cartesian(ylim=c(0, 1250))
plot_num_reports_by_test_name(data, "Number of OONI reports by test name")
plot_num_reports_by_test_name(data[data$date >= as.POSIXct("2018-01-01"), ], "Number of OONI reports by test name (since 2018-01-01)")
plot_num_reports_by_country(data, "Number of OONI reports by country and continent")
plot_num_reports_by_country(data[data$date >= as.POSIXct("2018-01-01"), ], "Number of OONI reports by country and continent (since 2018-01-01)")
plot_num_reports_by_hdi(data, "Number of OONI reports by Human Development Index")
plot_num_reports_by_hdi(data[data$date >= as.POSIXct("2018-01-01"), ], "Number of OONI reports by Human Development Index (since 2018-01-01)")
